For our project, we used the StackOverFlow open questionnaires. Questionnaires are presented in the form of csv-files for each year separately from 2011 to 2019. Each year, the questionnaire is modified: some questions are added or removed, questions and possible answers also undergo changes.
The aim of the project is to analyze the questionnaires for all the years and find something interesting. :) Before proceeding to the analysis, it is necessary to carry out pre-processing, namely, to bring the questionnaires for all years (with different questions and different answer options) to a common form.
We import the necessary libraries for work.
import turicreate as tc
from turicreate import SFrame
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly_express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
from wordcloud import WordCloud, STOPWORDS
import warnings
import plotly.graph_objects as go
warnings.filterwarnings('ignore')
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')
db_path='research/'
file_names=os.listdir(db_path)
def get_schema(file):
for i in enumerate(file.columns):
print(i)
First, let's look at the size of our questionnaires.
df2011=pd.read_csv('research/2011.csv', header=0,encoding = "latin", error_bad_lines=False)
df2012=pd.read_csv('research/2012.csv', header=0,encoding = "latin", error_bad_lines=False)
df2013=pd.read_csv('research/2013.csv', header=0,encoding = "latin", error_bad_lines=False)
df2014=pd.read_csv('research/2014.csv', header=0,encoding = "latin", error_bad_lines=False)
df2015=pd.read_csv('research/2015.csv', header=0,encoding = "latin", error_bad_lines=False)
df2016=pd.read_csv('research/2016.csv', header=0,encoding = "latin", error_bad_lines=False)
df2017=pd.read_csv('research/2017.csv', header=0,encoding = "latin", error_bad_lines=False)
df2018=pd.read_csv('research/2018.csv', header=0,encoding = "latin", error_bad_lines=False)
df2019=pd.read_csv('research/2019.csv', header=0,encoding = "latin", error_bad_lines=False)
df=[df2011,df2012,df2013,df2014,df2015,df2016,df2017,df2018,df2019]
year=2011
for i in df:
print (f'Year:{year}, Rows: {i.shape[0]}, Columns: {i.shape[1]}')
year+=1
The values of the columns correspond to various questions, and the rows - to the answers of the respondents. As we see, we have a huge number of questions, which necessitates careful processing.
In a detailed review of each questionnaire separately, we found that the data we need is spread across different columns. For example, education data is scattered over several columns (as different answer options). In order to combine this data into one column, we write a function.
def get_info4columns(df,new_name_column,first_column,last_column=None):
if last_column!=None:
last_column+=1
time_list=[]
for row in range(len(df)):
data=list(df.iloc[row,first_column:last_column].dropna())
d=[]
for i in data:
if len(i)>1:
d.append(i)
time_list.append(d)
df[new_name_column]=time_list
return df
We are faced with the fact that the answers variants about education varied from year to year. We need to bring all variants of answers for all years to a general form. Let's write a function.
def get_education4normal(x):
s=0
taught=0
online=0
education=[]
master=0
bac=0
doc=0
try:
for i in x:
if 'Mathematics'in i or 'Computer' in i or 'Information' in i or 'Management' in i or 'Web' in i:
s+=1
if 'taught' in i or 'Taught' in i:
taught+=1
if 'Master' in i or 'Professional' in i:
master+=1
if 'Bachelor' in i or 'Associate' in i or 'B.' in i:
bac+=1
if 'Doctoral ' in i or 'PhD' in i or 'doctoral' in i:
doc+=1
if 'Online' in i or 'online' in i:
education.append('Online class')
online+=1
except:
return x
if s==0 and online>0:
return 'Online class'
if s==0 and online==0 and taught>0:
return 'Self-taught'
if s>0:
if doc>=1:
education.append('Doctor in Computer Science')
if master>=1 and doc==0:
education.append('Master in Computer Science')
if bac>=1 and doc==0 and master==0:
education.append('Bachelor in Computer Science')
if len(education)>0:
return ' + '.join(education)
else:
return None
def clear_list(x):
a=[]
for i in x:
if ';' in i:
i=i.split(';')
for u in i:
a.append(u)
else:
a.append(i)
return a
Let's create a new DataFrame with the columns we need for research. We will conduct pre-processing for each questionnaire separately and write the results to our new DataFrame.
df_all=pd.DataFrame({'Year': [], 'Country': [], 'Age': [],'Gender': [], 'Salary': [],'Education': [],'Skills': [],'IDE': [],})
for file in file_names:
path=f'{db_path}{file}'
year=file.split('.')[0]
if '2015' in file:
test=pd.read_csv(path, header=1,encoding = "latin", error_bad_lines=False)
else:
test=pd.read_csv(path, encoding = "latin", error_bad_lines=False)
if '2011' in file:
test=test.iloc[:,[0,2,45]]
test.columns=['Country','Age','Salary']
test['Year']=int(year)
df_all=df_all.append(test, ignore_index = True)
if '2012' in file:
test=test.iloc[:,[0,2,39]]
test.columns=['Country','Age','Salary']
test['Year']=int(year)
df_all=df_all.append(test, ignore_index = True)
if '2013' in file:
test=test.iloc[:,[0,2,100]]
test.columns=['Country','Age','Salary']
test['Year']=int(year)
df_all=df_all.append(test, ignore_index = True)
if '2014' in file:
test=test.iloc[:,[0,3,4,5,6,7,42,43,44,45,46,47,48,49,50,51,52,55,56,57,58,59,60,61,62,63,64,65,66,86]]
test.drop([0], inplace=True)
test=get_info4columns(test,'Skills',6,28)
test=test.drop(test.columns[[6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29]],axis=1)
test.columns=['Country','Age','Gender','IT_Expirience','Occupation','Salary','Skills']
test['Year']=int(year)
df_all=df_all.append(test, ignore_index = True)
if '2015' in file:
rep=[i for i in range(8,50)]
repp=[0,1,2,4,5,106,97,101,102,103,188,189]
repp=repp+rep
test=test.iloc[:,repp]
test=get_info4columns(test,'Skills',12)
rep=[i for i in range(12,54)]
test=test.drop(test.columns[rep],axis=1)
test=get_info4columns(test,'educ1',6,9)
test=test.drop(test.columns[[6,7,8,9]],axis=1)
test.columns=['Country','Age','Gender','IT_Expirience','Occupation','Salary','IDE','2','Skills','Education']
test=test.drop(columns=['2'])
test['Year']=int(year)
test['Education']= test['Education'].apply(lambda x:get_education4normal(x))
df_all=df_all.append(test, ignore_index = True)
if '2016' in file:
repp=[2,6,7,12, 9, 14, 16, 37,28]
test=test.iloc[:,repp]
test.columns=['Country','Age','Gender','IT_Expirience','Occupation','Salary','Skills','Education','IDE']
test['Year']=int(year)
test['Skills']=test['Skills'].str.split(pat=';')
test['Education']=test['Education'].str.split(pat=';')
test['IDE']=test['IDE'].str.split(pat=';')
test['Education']= test['Education'].apply(lambda x:get_education4normal(x))
df_all=df_all.append(test, ignore_index = True)
if '2017' in file:
repp=[3,145,6,7,83,11,14,15,16,17,152,88,90,92,96]
test=test.iloc[:,repp]
test=get_info4columns(test,'Education2',2,last_column=4)
test=get_info4columns(test,'Occup',6,last_column=9)
test=get_info4columns(test,'Skills',11,last_column=13)
test['Skills']=test['Skills'].apply(lambda x: clear_list(x))
test=test.drop(test.columns[[2,3,4,6,7,8,9,11,12,13]],axis=1)
test['IDE']=test['IDE'].str.split(pat=';')
test.columns=['Country','Gender','IT_Expirience','Salary','IDE','Education','Occupation','Skills']
test['Year']=int(year)
test['Education']= test['Education'].apply(lambda x:get_education4normal(x))
df_all=df_all.append(test, ignore_index = True)
if '2018' in file:
repp=[3,6,7,58,122,120,124,9,10,52,65,67,71,73]
test=test.iloc[:,repp]
test['IDE']=test['IDE'].str.split(pat=';')
test=get_info4columns(test,'Skills',10,last_column=12)
test['Skills']=test['Skills'].apply(lambda x: clear_list(x))
test=test.drop(test.columns[[4,10,11,12]],axis=1)
test=get_info4columns(test,'Education',1,last_column=3)
test['Education']= test['Education'].apply(lambda x:get_education4normal(x))
test=test.drop(test.columns[[1,2,3]],axis=1)
test['Occupation']=test['DevType'].str.split(pat=';')
test=test.drop(test.columns[[3]],axis=1)
test=test.rename(columns={"YearsCoding": "IT_Expirience"})
test['Year']=int(year)
df_all=df_all.append(test, ignore_index = True)
if '2019' in file:
repp=[6,8,9,10,12,13,43,45,49,51,53,77,78,31]
test=test.iloc[:,repp]
test['IDE']=test['DevEnviron'].str.split(pat=';')
test=get_info4columns(test,'Skills',6,last_column=9)
test['Skills']=test['Skills'].apply(lambda x: clear_list(x))
test=get_info4columns(test,'Education',1,last_column=3)
test['Education']= test['Education'].apply(lambda x:get_education4normal(x))
test['Occupation']=test['DevType'].str.split(pat=';')
test=test.drop(test.columns[[1,2,3,4,6,7,8,9,10]],axis=1)
test=test.rename(columns={"YearsCode": "IT_Expirience"})
test=test.rename(columns={"ConvertedComp": "Salary"})
test['Year']=int(year)
df_all=df_all.append(test, ignore_index = True)
df_all=df_all.sort_values(by=['Year'],ascending=False)
df_all
After receiving a single DataFrame, we still need to bring the values of each column to a general form. As we have done above with the education column. Let's start with the age.
def age_category(x):
if x==None:
return None
if '20' in x:
return 'young <25'
else:
return ' old >25'
def age_rank(x):
a='< 20'
y='20 - 24'
z='25 - 34'
r='35 - 44'
o='45 - 60'
q='> 60'
if '18 - 24 years old' in str(x):
return y
if '-' in str(x):
x=x.split('-')[0]
x=x.strip()
try:
x=int(x)
if x<20:
return a
if 20<=x<=24:
return y
if 25<=x<=34:
return z
if 35<=x<=44:
return r
if 45<=x<=60:
return o
if x>60:
return q
except:
if '65 years or older' in str(x) or '>' in str(x):
return q
if '<' in str(x) or 'Under 18 years old' in str(x):
return a
else:
return None
df_all['Age_range'] = df_all['Age'].apply(lambda x: age_rank(x))
df_all['Age_category'] = df_all['Age_range'].apply(lambda x: age_category(x))
df_all['Age_range'].unique()
Let's put the countries in order.
def clean_country(country):
try:
if country =='United States':
return 'United States of America'
if 'Ireland' in country :
return 'Ireland'
if country =='Other (please specify)':
return 'Other'
if country =='Korea South':
return 'South Korea'
if country =='Republic of Korea':
return 'South Korea'
if country =='Viet Nam':
return 'Vietnam'
if country =='Australasia':
return 'Australia'
if 'Congo' in country:
return 'Congo'
if 'Hong Kong' in country:
return 'Hong Kong'
if 'Iran' in country:
return 'Iran'
if 'Russia' in country:
return 'Russian Federation'
else:
return country
except:
return None
df_all['Country']=df_all['Country'].apply(lambda x:clean_country(x) )
Let's put the salaries in order.
def get_range_salary(salary,mid=0):
if '-' in str(salary):
salary=str(salary).split('-')[0]
if '$' in str(salary):
salary=str(salary).split('$')[1]
if ',' in str(salary):
salary=str(salary).replace(',','')
if mid==0:
a='0'
b='<20.000'
c='$20.000 - $40.000'
d='$40.000 - $60.000'
e='$60.000 - $80.000'
f='$80.000 - $100.000'
j='$100.000 - $120.000'
h='$120.000 - $140.000'
i='>$140.000'
if mid==1:
a=0
b=10000
c=30000
d=50000
e=70000
f=90000
j=110000
h=130000
i=150000
try:
salary=float(salary)
if (salary)==0:
return a
if (salary)<20000:
return b
if 20000<=(salary)<40000:
return c
if 40000<=(salary)<60000:
return d
if 60000<=(salary)<80000:
return e
if 80000<=(salary)<100000:
return f
if 100000<=(salary)<120000:
return j
if 120000<=(salary)<140000:
return h
if 140000<=(salary):
return i
except ValueError:
if 'Student ' in str(salary):
return a
else:
return None
df_all['Salary Range']=df_all['Salary'].apply(lambda x:get_range_salary(x))
df_all['Salary Midpoint']=df_all['Salary'].apply(lambda x:get_range_salary(x,1))
df_all['Salary Range'].unique()
We perform a similar operation, which was above, on the Occupation column.
def get_occupation(x):
if isinstance(x,list):
try:
a= ",".join(x)
except:
a= None
else:
a= x
if 'ata scientist' in str(a) or 'Machine' in str(a) or 'nalyst' in str(a) or 'intelligence' in str(a):
return 'Data scientist or machine learning specialist'
elif 'ack-end' in str(a) or 'Back-End' in str(a):
return 'Back-end Developer'
elif 'ront-end' in str(a) or 'Front-End' in str(a) or 'raphics' in str(a):
return 'Front-end Developer'
elif 'ull-stack' in str(a) or 'Full' in str(a) or 'Web developer' in str(a):
return 'Full-stack Developer'
elif 'obile' in str(a):
return 'Mobile Developer'
elif 'DevOps' in str(a):
return 'DevOps'
elif 'esktop' in str(a) or 'pplication' in str(a) or 'Enterprise' in str(a):
return 'Desktop application developer'
elif 'atabase' in str(a) or 'Engineer' in str(a):
return 'Database administrator'
elif 'dministrator' in str(a):
return 'Systems administrator'
elif 'anager' in str(a) or 'xecutive' in str(a) or 'sales' in str(a):
return 'Manager of Developers or Team Leader'
elif 'Quality' in str(a) or 'QA' in str(a):
return 'Quality assurance engineer'
elif 'eveloper' in str(a):
return 'Full-stack Developer'
elif 'esigner' in str(a):
return 'Designer'
elif 'cademic' in str(a) or 'cientist' in str(a) or 'Educator' in str(a):
return 'Academic researcher'
elif 'Student' in str(a):
return 'Student'
else:
return "Other"
df_all['Occupation']=df_all['Occupation'].apply(lambda x:get_occupation(x))
df_all['Occupation'].unique()
Now it's the Gender column turn.
def clear_gender(x):
x=str(x)
if ';' in x:
x=x.split(';')[0]
x=x.strip()
if 'Man' in x:
x='Male'
if 'Woman' in x:
x='Female'
if 'non-conforming' in x:
x='Other'
if 'nan' in x:
x='Prefer not to disclose'
return x
df_all['Gender']=df_all['Gender'].apply(lambda x:clear_gender(x))
df_all['Gender'].unique()
Finally, let's process the experience column.
def expirience_range(expirience):
a = '0-2'
b = '3-5'
c = '6-10'
d = '11+'
expirience=str(expirience)
expirience=expirience.split('to')[0].strip()
if '2 - 5 years' in expirience:
return b
if 'Less' in expirience or '<' in expirience:
return a
if 'ore' in expirience or '+' in expirience:
return d
if '6/10/2014' in expirience :
return c
if '2/5/2014' in expirience:
return b
expirience=expirience.split('-')[0].strip()
x = float(expirience)
if x<3:
return a
if 3<=x<=5:
return b
if 6<=x<=10:
return c
if x>=11:
return d
else:
return None
df_all['IT_Expirience']=df_all['IT_Expirience'].apply(lambda expirience:expirience_range(expirience))
df_all['IT_Expirience'].unique()
df_all
Now we have a DataFrame suitable for further analysis.
graf_count_year=df_all.groupby(['Year']).size().reset_index(name='counts')
plt.figure(figsize = (10, 8), facecolor = None)
ax = sns.lineplot(x='Year', y='counts', data=graf_count_year)
plt.xlabel("Year")
plt.ylabel("Count")
From the graph we see that after 2014 the number of participants begins to grow every year more and more.
graf_count_age=df_all.groupby(['Year','Age_range']).size().reset_index(name='counts')
sns.set()
plt.figure(figsize = (16, 8), facecolor = None)
ax = sns.lineplot(x="Year", y="counts", hue="Age_range", data=graf_count_age)
plt.xlabel("Year")
plt.ylabel("Count")
After analyzing the graph of age ranks, we also see that all changes begin to occur after 2014. In the future, it will be logical to carefully consider only the years from 2014 to 2019. It can also be seen that the main number of specialists falls on the age from 20 to 45. The most actively growing specialist category is from 20 to 25 years.
graf_count_educ=df_all.groupby(['Year','Education']).size().reset_index(name='counts')
sns.set()
plt.figure(figsize = (16, 8), facecolor = None)
ax = sns.lineplot(x="Year", y="counts", hue="Education", data=graf_count_educ)
plt.xlabel("Year")
plt.ylabel("Count")
From this graph, we see that the largest number of specialists has a bachelor's degree. Since 2017, the number of specialists who have received online education has begun to grow, and to date, this type of education has been very popular. Perhaps this is due to the growth of high-quality online resources. For example, Coursera presents courses from prestigious higher education institutions from around the world in different languages. It can also be noted that the number of specialists with a doctoral education is a very low part for the entire time, which is quite logical.
graf_salary_educ=df_all.groupby(['Year','Education'], as_index=False)['Salary Midpoint'].mean()
sns.set()
plt.figure(figsize = (16, 8), facecolor = None)
ax = sns.lineplot(x="Year", y="Salary Midpoint", hue="Education", data=graf_salary_educ)
plt.xlabel("Year")
plt.ylabel("Salary Average")
From the graph of the dependence of education on average wages, we see that the highest wages belong to the PhDs, and there is a significant difference compared to the other categories. They are followed by bachelors, but in recent years we have seen an increase in salaries for self-taught people and online classes. In 2019, their salaries compete with bachelor's salaries. What is quite an interesting phenomenon.
graf_count_ocup=df_all.groupby(['Year','Occupation']).size().reset_index(name='counts')
sns.set()
plt.figure(figsize = (16, 8), facecolor = None)
ax = sns.lineplot(x="Year", y="counts", hue="Occupation",palette='muted' ,data=graf_count_ocup)
plt.xlabel("Year")
plt.xlim(2014)
plt.ylabel("Count")
Consider the number of employees in different positions. It is interesting to note that we see a sharp increase among the back-end developers, followed by the category of full-stack developers (which is obvious, as these are two very close positions). All other posts are moderately distributed over the years. Perhaps this is due to salaries in this area. Now we will check it.
df_occup_salary=df_all.groupby(['Year','Occupation'], as_index=False)['Salary Midpoint'].mean()
sns.set()
plt.figure(figsize = (16, 12), facecolor = None)
ax = sns.lineplot(x="Year", y="Salary Midpoint", hue="Occupation",palette='muted' ,data=df_occup_salary)
plt.xlabel("Year")
plt.ylabel("Salary Midpoint")
plt.xlim(2014)
From the graph, we see that the leaders of the teams earn the most. In recent years, database administrators have been catching up with them (perhaps they are Data Scientists). Devops are also among the well-paid professionals. After them, the salaries of full-stack and back-end developers, as we see, these are not the highest paying positions.
df_occup_salary=df_all[df_all['Year']==2019]
df_occup_salary=df_occup_salary[['Occupation','Salary Midpoint','Age_category']].dropna()
fig, ax =plt.subplots(figsize=(10,10))
sns.boxenplot(y="Occupation", x="Salary Midpoint", hue="Age_category",
data=df_occup_salary)
In all positions, young professionals earn less. This situation is better only on the positions of team leaders and database administrators (As pointed out above, data scientists).
Let's try to consider the occupation in terms of education.
def get_educ(x):
try:
if 'Master' in x:
return 'Master in Computer Science'
elif 'Doctor' in x:
return 'Doctor in Computer Science'
elif 'Bachelor ' in x:
return 'Bachelor in Computer Science'
elif 'Other' in x or 'Online' in x or 'Self' in x:
return 'Not Educated'
except:
return None
df_occup_educ=df_all.copy()
df_occup_educ['Education']=df_occup_educ['Education'].apply(lambda x:get_educ(x))
df_occup_educ=df_occup_educ.groupby(['Education','Occupation']).size().reset_index(name='counts')
df_occup_educ
df_occup_educ=df_occup_educ.pivot(index='Education', columns='Occupation', values='counts')
df_occup_educ
a1=df_occup_educ.values[0]
a2=df_occup_educ.values[1]
a3=df_occup_educ.values[2]
a4=df_occup_educ.values[3]
a=list(zip(a1, a2, a3,a4))
b=df_occup_educ.columns
results=dict(zip(b,a))
results
category_names = ['Bachelor in Computer Science', 'Doctor in Computer Science',
'Master in Computer Science', 'Not Educated']
def survey(results, category_names):
labels = list(results.keys())
data = np.array(list(results.values()))
data_cum = data.cumsum(axis=1)
category_colors = plt.get_cmap('RdYlGn')(
np.linspace(0.15, 0.85, data.shape[1]))
fig, ax = plt.subplots(figsize=(15, 7))
ax.invert_yaxis()
ax.xaxis.set_visible(False)
ax.set_xlim(0, np.sum(data, axis=1).max())
for i, (colname, color) in enumerate(zip(category_names, category_colors)):
widths = data[:, i]
widths=widths
starts = data_cum[:, i] - widths
ax.barh(labels, widths, left=starts, height=1,
label=colname, color=color)
xcenters = starts + widths / 2
r, g, b, _ = color
text_color = 'white' if r * g * b < 0.5 else 'darkgrey'
for y, (x, c) in enumerate(zip(xcenters, widths)):
ax.text(x, y, str(int(c)), ha='center', va='center',
color=text_color)
ax.legend(ncol=len(category_names), bbox_to_anchor=(0, 1),
loc='lower left', fontsize='small')
return fig, ax
survey(results, category_names)
From the graph, we see that about 30% of specialists from the back-end and full-stack areas work without education. We also see that in Hi-Tech we can meet people without education in a sufficiently large number.
plt.figure(figsize = (16, 8), facecolor = None)
sns.violinplot(x='Year', y='Salary Midpoint',hue='Age_category',split=True,data=df_all)
We observe that in recent years, young professionals more and more achieve success and receive the highest salaries in IT. Although in the past years in this category there were only older representatives.
Let's make a rating of the countries with the highest salaries.
df_all=df_all[df_all['Year']>2013]
df_country=df_all.groupby(['Year','Country'], as_index=False).agg({'Salary Midpoint': ['mean','size']})
df_country.columns=list(map(' '.join,df_country.columns.values))
df_country=df_country[df_country['Salary Midpoint size']>30]
df_country=df_country.dropna()
df_country.sort_values(by=['Salary Midpoint size'],ascending=False)
df_country.columns=['Year','Country','Salary_mean','People_count']
df_country
px.choropleth(df_country, locations='Country', locationmode='country names',animation_frame="Year", color='Salary_mean')
We will display the countries on the map and color them according to the salaries.
df_country_top=df_country.groupby(['Country'], as_index=False)['Salary_mean'].mean()
df_country_top=df_country_top.sort_values(by=['Salary_mean'],ascending=False)
top12=df_country_top['Country'].array[:12]
df_filter = df_country['Country'].isin(top12)
df_top12=df_country[df_filter]
fig, ax = plt.subplots(figsize=(8,8))
sns.barplot(y='Country', x='Salary_mean',orient="h", data=df_country_top[:12], ax=ax)
We received the top 12 high-salary countries, and we see that Israel is on this list.
df_top12=df_top12[['Year','Country','Salary_mean']]
df_top12=df_top12.pivot(index='Year', columns='Country', values='Salary_mean')
f, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df_top12, ax=ax)
Now consider how the average salary throughout the world depends on gender. As we see, until 2017, women earned less than men. However, since 2017, this gap has disappeared, which is good news.
df_gender=df_all.query('Gender== "Male" or Gender=="Female"')
df_gender_salary=df_gender.groupby(['Year','Gender'], as_index=False)['Salary Midpoint'].mean()
sns.set()
plt.figure(figsize = (16, 8), facecolor = None)
ax = sns.lineplot(x="Year", y="Salary Midpoint", hue="Gender", data=df_gender_salary)
plt.xlabel("Year")
plt.ylabel("Salary Midpoint")
Consider the distribution of men and women in the last year by occupation.
df_occup_gender=df_all.groupby(['Year','Occupation','Gender']).size().reset_index(name='Counts')
df_occup_gender=df_occup_gender.query('(Gender== "Male" or Gender=="Female") and (Year==2019)')
df_occup_gender
sns.set(style="whitegrid")
g = sns.catplot(x="Counts", y="Occupation", hue="Gender", data=df_occup_gender, height=12, kind="bar", palette="muted")
g.despine(left=True)
g.set_ylabels("Occupation")
We see that in every area there are several times fewer women than men. In front-end and academic-research groups, women appear more often than in other areas. On the contrary, as a rule, team leaders are almost always men.
df_occup_gender_diag=df_all.groupby(['Year','Gender']).size().reset_index(name='Counts')
df_occup_gender_diag=df_occup_gender_diag.query('(Gender== "Male" or Gender=="Female") and (Year==2019)')
sizes=list(df_occup_gender_diag['Counts'].values)
labels=list(df_occup_gender_diag['Gender'].values)
explode = (0, 0.1)
fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
shadow=False, startangle=90)
ax1.axis('equal')
plt.show()
As noted earlier, in 2019 women make up only 8% of the respondents.
df_occup_salary=df_all.groupby(['Year','IT_Expirience'], as_index=False)['Salary Midpoint'].mean()
sns.set()
plt.figure(figsize = (16, 8), facecolor = None)
ax = sns.lineplot(x="Year", y="Salary Midpoint", hue="IT_Expirience", data=df_occup_salary)
plt.xlabel("Year")
plt.ylabel("Salary Midpoint")
The graph is completely as expected: as experience grows, the salary grows. No anomalies are noticed.
df_plot=df_all[df_all['Year']==2019]
df_plot=df_plot.groupby(['Occupation','IT_Expirience']).size().reset_index(name='Counts')
df_plot=df_plot.pivot(index='IT_Expirience', columns='Occupation', values='Counts')
a1=df_plot.values[0]
a2=df_plot.values[1]
a3=df_plot.values[2]
a4=df_plot.values[3]
x = df_plot.columns
fig = go.Figure()
fig.add_trace(go.Bar(x=x, y=a1, name='Expirience 0-2'))
fig.add_trace(go.Bar(x=x, y=a3, name='Expirience 3-5'))
fig.add_trace(go.Bar(x=x, y=a4, name='Expirience 6-10'))
fig.add_trace(go.Bar(x=x, y=a2, name='Expirience 11+'))
fig.update_layout(barmode='relative', title_text='Expirience/Occupation-Year 2019')
fig.show()
From the graph we see that in 2019 the most popular areas are: back-end, full-stack, data-science and front-end. As we see, to work in these areas it is desirable to have 2-years experience.
df_plottt=df_all[df_all['Year']==2019]
df_plottt=df_plottt.groupby(['IT_Expirience','Education']).size().reset_index(name='Counts')
df_plottt=df_plottt.pivot(index='Education', columns='IT_Expirience', values='Counts')
def get_proc(x):
x=np.array(x)
lenght=np.sum(x)
a=[]
for i in x:
i=i*100/lenght
i=round(i,1)
a.append(i)
return a
a1=df_plottt.values[0]
a2=df_plottt.values[1]
a3=df_plottt.values[2]
a4=df_plottt.values[3]
a5=df_plottt.values[4]
a6=df_plottt.values[5]
a7=df_plottt.values[6]
a8=df_plottt.values[7]
x_data=[a1,a2,a3,a4,a5,a6,a7,a8]
x_data=list(map(lambda x: get_proc(x) , x_data))
x_data
top_labels = ['Expirience 0-2', 'Expirience 3-5', 'Expirience 6-10', 'Expirience 11+']
colors = ['rgba(38, 24, 74, 0.8)', 'rgba(71, 58, 131, 0.8)',
'rgba(122, 120, 168, 0.8)', 'rgba(164, 163, 204, 0.85)',
'rgba(190, 192, 213, 1)']
y_data = ['Bachelor in Computer Science',
'Doctor in Computer Science',
'Master in Computer Science', 'Online class',
'Online class + Bachelor in Computer Science',
'Online class + Doctor in Computer Science', 'Online class + Master in Computer Science', 'Self-taught']
fig = go.Figure()
for i in range(0, len(x_data[0])):
for xd, yd in zip(x_data, y_data):
fig.add_trace(go.Bar(
x=[xd[i]], y=[yd],
orientation='h',
marker=dict(
color=colors[i],
line=dict(color='rgb(248, 248, 249)', width=1)
)
))
fig.update_layout(
xaxis=dict(
showgrid=False,
showline=False,
showticklabels=False,
zeroline=False,
domain=[0.15, 1]
),
yaxis=dict(
showgrid=False,
showline=False,
showticklabels=False,
zeroline=False,
),
barmode='stack',
paper_bgcolor='rgb(248, 248, 255)',
plot_bgcolor='rgb(248, 248, 255)',
margin=dict(l=120, r=10, t=140, b=80),
showlegend=False,
)
annotations = []
for yd, xd in zip(y_data, x_data):
annotations.append(dict(xref='paper', yref='y',
x=0.14, y=yd,
xanchor='right',
text=str(yd),
font=dict(family='Arial', size=14,
color='rgb(67, 67, 67)'),
showarrow=False, align='right'))
annotations.append(dict(xref='x', yref='y',
x=xd[0] / 2, y=yd,
text=str(xd[0]) + '%',
font=dict(family='Arial', size=14,
color='rgb(248, 248, 255)'),
showarrow=False))
if yd == y_data[-1]:
annotations.append(dict(xref='x', yref='paper',
x=xd[0] / 2, y=1.1,
text=top_labels[0],
font=dict(family='Arial', size=14,
color='rgb(67, 67, 67)'),
showarrow=False))
space = xd[0]
for i in range(1, len(xd)):
annotations.append(dict(xref='x', yref='y',
x=space + (xd[i]/2), y=yd,
text=str(xd[i]) + '%',
font=dict(family='Arial', size=14,
color='rgb(248, 248, 255)'),
showarrow=False))
if yd == y_data[-1]:
annotations.append(dict(xref='x', yref='paper',
x=space + (xd[i]/2), y=1.1,
text=top_labels[i],
font=dict(family='Arial', size=14,
color='rgb(67, 67, 67)'),
showarrow=False))
space += xd[i]
fig.update_layout(annotations=annotations)
fig.show()
Now the IT market is dominated by people with 3-5 years of experience with varying degrees of education, as well as without education.
df_exp_age=df_all[df_all['Year']==2019]
df_exp_age=df_exp_age.groupby(['IT_Expirience','Age_range']).size().reset_index(name='Counts')
df_exp_age=df_exp_age.pivot(index='IT_Expirience', columns='Age_range', values='Counts')
df_exp_age
a1=df_exp_age.values[0]
a2=df_exp_age.values[1]
a3=df_exp_age.values[2]
a4=df_exp_age.values[3]
x_data=[a1,a2,a3,a4]
x_data=list(map(lambda x: get_proc(x) , x_data))
x_data
labels = ["20 - 24", "25 - 34", "35 - 44", "45 - 60", "< 20", "> 60"]
fig = make_subplots(rows=1, cols=4, specs=[[{'type':'domain'}, {'type':'domain'},{'type':'domain'},{'type':'domain'}]])
fig.add_trace(go.Pie(labels=labels, values=x_data[0], name="0-2"),
1, 1)
fig.add_trace(go.Pie(labels=labels, values=x_data[1], name="11+"),
1, 2)
fig.add_trace(go.Pie(labels=labels, values=x_data[2], name="3-5"),
1, 3)
fig.add_trace(go.Pie(labels=labels, values=x_data[3], name="6-10"),
1, 4)
fig.update_traces(hole=.4, hoverinfo="label+percent+name")
fig.update_layout(
title_text="Expirience/Age",
annotations=[dict(text='IT Expirience 0-2', x=0.0, y=1.0, font_size=20, showarrow=False),
dict(text='IT Expirience 11+', x=0.25, y=1.0, font_size=20, showarrow=False),
dict(text='IT Expirience 3-5', x=0.6, y=1.0, font_size=20, showarrow=False),
dict(text='IT Expirience 6-10', x=0.95, y=1.0, font_size=20, showarrow=False)])
fig.show()
A small number of people over 40 in IT. The main number - age from 25 to 34.
df_age_educ=df_all.copy()
df_age_educ=df_age_educ[df_age_educ['Year']==2019]
df_age_educ['Education']=df_age_educ['Education'].apply(lambda x:get_educ(x))
df_age_educ=df_age_educ.groupby(['Education','Age_range']).size().reset_index(name='counts')
df_age_educ=df_age_educ.pivot(index='Education', columns='Age_range', values='counts')
df_age_educ
a1=df_age_educ.values[0]
a2=df_age_educ.values[1]
a3=df_age_educ.values[2]
a4=df_age_educ.values[3]
x_data=[a1,a2,a3,a4]
x_data=list(map(lambda x: get_proc(x) , x_data))
x_data
labels = ["20 - 24", "25 - 34", "35 - 44", "45 - 60", "< 20", "> 60"]
fig = make_subplots(rows=1, cols=4, specs=[[{'type':'domain'}, {'type':'domain'},{'type':'domain'},{'type':'domain'}]])
fig.add_trace(go.Pie(labels=labels, values=x_data[0], name="Bachelor"),
1, 1)
fig.add_trace(go.Pie(labels=labels, values=x_data[1], name="Doctor"),
1, 2)
fig.add_trace(go.Pie(labels=labels, values=x_data[2], name="Master"),
1, 3)
fig.add_trace(go.Pie(labels=labels, values=x_data[3], name="Not Educated"),
1, 4)
fig.update_traces(hole=.4, hoverinfo="label+percent+name")
fig.update_layout(
title_text="Education/Age",
annotations=[dict(text='Bachelor in Computer Science', x=0.0, y=1.1, font_size=17, showarrow=False),
dict(text='Doctor in Computer Science', x=0.25, y=1.0, font_size=17, showarrow=False),
dict(text='Master in Computer Science', x=0.6, y=1.1, font_size=17, showarrow=False),
dict(text='Not Educated', x=0.95, y=1.0, font_size=17, showarrow=False)])
fig.show()
df_1=df_all.copy()
df_1['Education']=df_1['Education'].apply(lambda x:get_educ(x))
df_1=df_1.groupby(['Education','Year']).size().reset_index(name='counts')
sns.set()
plt.figure(figsize = (16, 8), facecolor = None)
ax = sns.lineplot(x="Year", y="counts", hue="Education", data=df_1)
plt.xlabel("Year")
plt.ylabel("Counts")
By 2019, the number of bachelors and people without education has become equal and occupies a leading position.
df_all.to_csv('pp.csv')
sf = SFrame.read_csv(url='pp.csv',verbose=False )
sf=sf[sf['Year']>2013]
sf_skills=sf.stack('Skills', new_column_name='Skills')
dd=sf_skills.to_dataframe()
sf_ide=sf.stack('IDE', new_column_name='IDE')
df_ide=sf_ide.to_dataframe()
dd
dd['Skills']=dd['Skills'].str.strip()
dd1=dd[dd['Year']==2019]
dd1=dd1.groupby(['Skills']).size().reset_index(name='counts')
dd1=dd1.sort_values(by=['counts'], ascending=False)
dd2=dd.groupby(['Skills','Year']).size().reset_index(name='counts')
dd1=dd1[:15]
fig, ax = plt.subplots(figsize=(8,8))
sns.barplot(y='Skills', x='counts',orient="h", data=dd1, ax=ax)
We have compiled a list of the top 15 languages and technologies and reviewed their popularity over time.
top10=dd1['Skills'].array[:15]
df_filter = dd2['Skills'].isin(top10)
df_top10=dd2[df_filter]
sns.set()
plt.figure(figsize = (16, 8), facecolor = None)
ax = sns.lineplot(x="Year", y="counts", hue="Skills", data=df_top10)
plt.xlabel("Year")
plt.ylabel("Counts")
dd1=dd[dd['Year']==2019]
dd1=dd1.groupby(['Skills'], as_index=False)['Salary Midpoint'].mean()
dd1=dd1.sort_values(by=['Salary Midpoint'], ascending=False)
dd2=dd.groupby(['Skills','Year'], as_index=False)['Salary Midpoint'].mean()
dd1=dd1[:15]
fig, ax = plt.subplots(figsize=(8,8))
sns.barplot(y='Skills', x='Salary Midpoint',orient="h", data=dd1, ax=ax)
We have compiled a list of the top 15 languages and technologies by wages and analyzed by time. In 2019, languages and technologies that allow working with Big Data are great value.
top10=dd1['Skills'].array[:15]
df_filter = dd2['Skills'].isin(top10)
df_top10=dd2[df_filter]
sns.set()
plt.figure(figsize = (16, 8), facecolor = None)
ax = sns.lineplot(x="Year", y="Salary Midpoint", hue="Skills", data=df_top10)
plt.xlabel("Year")
plt.ylabel("Counts")
df_plotly=dd[dd['Year']==2019]
df_plotly=df_plotly[['Salary Midpoint','Occupation','Skills']].dropna()
df_plotly=df_plotly.groupby(['Occupation','Skills'], as_index=False).agg({'Salary Midpoint': ['mean','size']})
df_plotly.columns=['Occupation','Skills','Salary mean','Count']
px.scatter(df_plotly, x="Occupation", y="Salary mean", color="Skills", size='Count', size_max=50,width=1200,height=600)
We considered which languages and technologies use people of different professions. And the dependence of the wages of employees of these professions on the knowledge of a particular language.
def counter(x):
if isinstance(x,list):
return int(len(x))
if isinstance(x,str):
return int(1)
else:
return None
df_all1=df_all[df_all['Year']==2019]
df_all1['Count skills']=df_all1['Skills'].dropna()
df_all1['Count skills']=df_all1['Count skills'].apply(lambda x:counter(x))
x=df_all1['Count skills']
sns.distplot(x);
df_all1=df_all[df_all['Year']==2019]
df_all1=df_all1[['Skills','Salary Midpoint']].dropna()
df_all1['Count skills']=df_all1['Skills'].apply(lambda x:counter(x))
x=df_all1['Count skills']
y=df_all1['Salary Midpoint']
sns.jointplot(x, y, kind="kde", xlim=(0,25), ylim=(0,175000))
In each segment of salaries, it is necessary and sufficient to own 8-10 languages / technologies.
wordcloud = WordCloud(width = 500, height = 500,
background_color ='black',
min_font_size = 10)
df_ide1=df_ide['IDE'].dropna().str.strip()
c = Counter(list(df_ide1.values))
wordcloud.generate_from_frequencies(frequencies=c)
plt.figure(figsize = (10, 10), facecolor = None)
plt.imshow(wordcloud)
We made an IDE frequency tag cloud for fun.
Conclusions
from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir("/content/gdrive/My Drive")
os.getcwd()
!pip install turicreate
!pip install plotly_express
# !pip install pycountry
del os.environ['LC_ALL']